This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.

Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.

They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.

Data import

Import of the expression data and the N-responsive genes and regulators :

load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426   45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426  201
ALPHAS <- seq(0,1, by = 0.1)

# subset <- sample(genes, replace = F, size = 20)
subset <- genes

Is the mse variation worsened by unmapping prior values and expression profiles of TFs?

For Rfs

# lmses <- data.frame(row.names = subset)
# N <-100
# for(alpha in ALPHAS){
#   for(perm in 1:N){
#     lmses[,paste(as.character(alpha), perm, "true_data")] <- bRF_inference_MSE(counts, subset, tfs, alpha = alpha, nTrees = 2000,
#                              pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = F)
#   }
# 
#   for(perm in 1:N){
#     lmses[,paste(as.character(alpha), perm, "shuffled")] <- bRF_inference_MSE(counts, subset, tfs, alpha = alpha, nTrees = 2000,
#                            pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = T)
#   }
# }
# 
# save(lmses, file = "results/brf_perumtations_mse_all_genes.rdata")

# subset<-unique(rownames(lmses))
draw_gene <- function(gene){
  lmses[gene,] %>%
    gather() %>%
    separate(key, into = c("alpha", "rep", "MSEtype"), sep = " ") %>%
    ggplot(aes(x=alpha, y=value, group =interaction(rep, MSEtype), 
               color = MSEtype))+ggtitle(gene)+ylab("MSE/Var(gene)")+
    xlab("alpha")+
    geom_line()+ggtitle(gene)+theme_pubr(legend = "top")
}


draw_gene_mean_sd <- function(gene, title = NULL){
  data <- lmses[gene, ] %>%
    gather() %>%
    separate(key,
             into = c("alpha", "rep", "MSEtype"),
             sep = " ") %>%
    group_by(alpha, MSEtype) %>%
    mutate(mean_mse = mean(value, na.rm = T),
           sd_mse = sd(value, na.rm = T)) %>%
    ggplot(aes(
      x = as.numeric(alpha),
      y = value,
      color = MSEtype,
      fill = MSEtype
    )) +ggtitle(paste(gene, title))+ylab("MSE/Var(gene)")+
    geom_ribbon(aes(ymin = mean_mse - sd_mse , 
                    ymax = mean_mse + sd_mse  ), 
                alpha = .4)  +theme_pubr(legend = "top")+
    geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha")

}

get_diff_curves <- function(lmses){
  data <- lmses %>%
  rownames_to_column('gene') %>%
    reshape2::melt()%>%
    separate(variable,
             into = c("alpha", "rep", "MSEtype"),
             sep = " ") %>%
    group_by(gene, alpha, MSEtype) %>%
    mutate(mean_mse = mean(value, na.rm = T),
           sd_mse = sd(value, na.rm = T)) %>%
    dplyr::select(mean_mse, sd_mse, gene, alpha, MSEtype)%>%
    distinct() 
  
  data_true <- filter(data, MSEtype=="true_data")
  data_perm <- filter(data, MSEtype=="shuffled") 
  data_true$mean_mse_perm <- data_perm$mean_mse
  data_true$sd_mse_perm <- data_perm$sd_mse
  return(data_true %>%
    mutate(mean_mse_diff = (mean_mse-mean_mse_perm)/sd_mse_perm))
  # %>%
  #   ggplot(aes(x=as.numeric(alpha), y=mean_mse_diff, color = gene))+
  #   geom_line()
}


# for(gene in sample(genes,50, replace = F)){
#     print(draw_gene(gene)+draw_gene_mean_sd(gene))
# }

For lasso

lmses <- data.frame(row.names = subset)
N<-100
for(alpha in ALPHAS){
  # set.seed(121314)
  for(perm in 1:N){
    lmses[,paste(as.character(alpha), perm, "true_data")] <- LASSO.D3S_inference_MSE(counts, subset, tfs, alpha = alpha, N=100,
                             pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = F)

    lmses[,paste(as.character(alpha), perm, "shuffled")] <- LASSO.D3S_inference_MSE(counts, subset, tfs, alpha = alpha, N=100,
                           pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = T)
  }

}
save(lmses, file = "results/lasso_perumtations_mse_all_genes.rdata")

# load("results/lasso_perumtations_mse_all_genes.rdata")
# for(gene in subset){
#     print(draw_gene(gene) +draw_gene_mean_sd(gene))
# }

Clustering genes for RFs

Based on the difference curves between true and permuted data

load("results/brf_perumtations_mse_all_genes.rdata")

diffs <- get_diff_curves(lmses)
diffs %>%
  ggplot(aes(x=as.numeric(alpha), y=mean_mse_diff, group = gene))+
  geom_line(alpha = 0.2)

fractions_out <- diffs %>%
  mutate(diff_greater_than_sd = ifelse(abs(mean_mse_diff)>1, 1, 0)) %>%
  group_by(gene) %>%
  summarise(fraction_out = sum(diff_greater_than_sd)/11);fractions_out<-
  setNames(fractions_out$fraction_out, fractions_out$gene)

diff_curves <- diffs[c("gene", "alpha", "mean_mse_diff")] %>%
  spread(alpha, mean_mse_diff) %>%
  column_to_rownames("gene") %>%
  as.matrix()


diff_curves<-diff_curves[fractions_out[rownames(diff_curves)] > 0,]
  
cor_clust = function(x) hclust(as.dist(1-cor(t(x))), method = "average")

Heatmap(diff_curves, cluster_rows = cor_clust,
        cluster_columns = F, show_row_names = F)

clusters_rf <- cutree(cor_clust(diff_curves), k = 2)
table(clusters_rf)
## clusters_rf
##   1   2 
## 346 325
clusters_rf<- c(clusters_rf,setNames(rep("no diff", sum(fractions_out==0)),
                                     names(fractions_out[fractions_out==0])))
table(clusters_rf)
## clusters_rf
##       1       2 no diff 
##     346     325     755
save(clusters_rf, file = "results/clusters_mse_bRF_100permutations.rdata")

for(gene in sample(genes,40, replace = F)){
    print(draw_gene(gene)+
            draw_gene_mean_sd(gene, title = paste(clusters_rf[gene], round(fractions_out[gene], 4))))
}

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(rep(colnames(diff_curves),1))),
    annotation_name_side = "left")
# draw a heatmap of the genes mean_mse on real data
true_mse <- diffs[c("gene", "alpha", "mean_mse")] %>%
  spread(alpha, mean_mse) %>%
  column_to_rownames("gene") %>%
  as.matrix() 
  

true_mse_pos <- true_mse[names(clusters_rf[clusters_rf==2]),]
Heatmap((true_mse-rowMeans(true_mse))/matrixStats::rowSds(true_mse),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(
    clusters_rf = clusters_rf[rownames(true_mse)],
    col=list(clusters_rf= setNames(c("darkorange", "darkgreen", "lightgrey"), 
                                       nm = names(table(clusters_rf)))))

Heatmap((true_mse_pos-rowMeans(true_mse_pos))/matrixStats::rowSds(true_mse_pos),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(
    clusters_rf = clusters_rf[rownames(true_mse_pos)],
    col=list(clusters_rf= setNames(c("darkorange", "darkgreen", "lightgrey"), 
                                       nm = names(table(clusters_rf)))))

Clustering genes for LASSO

Based on the difference curves between true and permuted data

load("results/lasso_perumtations_mse_all_genes.rdata")
diffs <- get_diff_curves(lmses)
diffs %>%
  ggplot(aes(x=as.numeric(alpha), y=mean_mse_diff, group = gene))+
  geom_line(alpha = 0.2)

fractions_out <- diffs %>%
  mutate(diff_greater_than_sd = ifelse(abs(mean_mse_diff)>1, 1, 0)) %>%
  group_by(gene) %>%
  summarise(fraction_out = sum(diff_greater_than_sd)/11);fractions_out<-
  setNames(fractions_out$fraction_out, fractions_out$gene)

diff_curves <- diffs[c("gene", "alpha", "mean_mse_diff")] %>%
  spread(alpha, mean_mse_diff) %>%
  column_to_rownames("gene") %>%
  as.matrix()


diff_curves<-diff_curves[fractions_out[rownames(diff_curves)] > 0,]
  
cor_clust = function(x) hclust(as.dist(1-cor(t(x))), method = "average")

Heatmap(diff_curves, cluster_rows = cor_clust,
        cluster_columns = F, show_row_names = F)

clusters_lasso <- cutree(cor_clust(diff_curves), k = 2)
table(clusters_lasso)
## clusters_lasso
##   1   2 
## 325 467
clusters_lasso<- c(clusters_lasso,setNames(rep("no diff", sum(fractions_out==0)),
                                     names(fractions_out[fractions_out==0])))
table(clusters_lasso)
## clusters_lasso
##       1       2 no diff 
##     325     467     634
save(clusters_lasso, file = "results/clusters_mse_lasso_100permutations.rdata")

for(gene in sample(genes,20, replace = F)){
    print(draw_gene(gene)+
            draw_gene_mean_sd(gene, title = paste(clusters_lasso[gene], round(fractions_out[gene], 4))))
}

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(rep(colnames(diff_curves),1))),
    annotation_name_side = "left")
# draw a heatmap of the genes mean_mse on real data
true_mse <- diffs[c("gene", "alpha", "mean_mse")] %>%
  spread(alpha, mean_mse) %>%
  column_to_rownames("gene") %>%
  as.matrix() 
  

true_mse_pos <- true_mse[names(clusters_lasso[clusters_lasso==1]),]
Heatmap((true_mse-rowMeans(true_mse))/matrixStats::rowSds(true_mse),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(
    clusters_lasso = clusters_lasso[rownames(true_mse)],
    col=list(clusters_lasso= setNames(c( "darkgreen","darkorange", "lightgrey"), 
                                       nm = names(table(clusters_lasso)))))

Heatmap((true_mse_pos-rowMeans(true_mse_pos))/matrixStats::rowSds(true_mse_pos),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(
    clusters_lasso = clusters_lasso[rownames(true_mse_pos)],
    col=list(clusters_lasso= setNames(c("darkgreen","darkorange", "lightgrey"), 
                                       nm = names(table(clusters_lasso)))))

Functional study of those groups

load("rdata/pwm_prom_jaspar_dap.rdata")
load("rdata/gene_structure.rdata")

mean_expr <- rowMeans(counts)[genes]
var_expr <- matrixStats::rowSds(counts[genes,])*matrixStats::rowSds(counts[genes,])
pwm_prom_n_TFs <- pwm_prom[pwm_prom$TF %in% tfs,]

library(patchwork)

# to comment for new version where mse is already normalized per genes
# norm_mse <- exp(as.matrix(cbind(lmses, lmses_lasso)))/var_expr
genes_info <- data.frame(genes = genes, 
                         cluster_rf = clusters_rf[genes],
                         cluster_lasso = clusters_lasso[genes])

genes_info$is_tf <- genes %in% tfs
genes_info$var <- var_expr
genes_info$expr <- mean_expr
genes_info$min_mse <- matrixStats::rowMins(as.matrix(true_mse))
genes_info$nb_motifs <- table(pwm_prom$target)[genes]
genes_info$nb_motifs_n_tfs <- table(pwm_prom_n_TFs$target)[genes]

genes_info[,c("n_introns", "n_transcripts")] <- 
  gene_structure[match(genes_info$gene, gene_structure$gene),
                 c("n_introns", "n_transcripts")]

genes_info%>%
  ggplot(aes(x=cluster_rf, y=log(n_introns))) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of introns for RF groups")) +
genes_info%>%
  ggplot(aes(x=cluster_rf, y=n_transcripts)) + 
  geom_boxplot(width=0.1, fill = "white")+
  geom_violin(fill="darkblue", alpha=0.2)+
  ggtitle(("Number of transcripts for RF groups")) + 
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=cluster_rf, y=nb_motifs)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs in promoter for RF groups")) + 
  genes_info%>%
  ggplot(aes(x=cluster_rf, y=nb_motifs_n_tfs)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs of N-responsive TFs in promoter for RF groups")) + 
  stat_compare_means() + genes_info%>%
  ggplot(aes(x=cluster_rf, y=min_mse)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Min mse for RF groups")) + 
  stat_compare_means() 

genes_info%>%
  ggplot(aes(x=cluster_rf, y=var)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for RF groups")) + scale_y_log10()+
  stat_compare_means()+ genes_info%>%
  ggplot(aes(x=cluster_rf, y=expr)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for RF groups")) + scale_y_log10()+
  stat_compare_means()

genes_info %>%
  group_by(cluster_rf) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_rf, y=tf_frac, 
                          label = paste("n=",n))) + 
  geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.2) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in RF groups")+ylim(c(0,0.2))

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=log(n_introns))) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of introns for LASSO groups")) +
genes_info%>%
  ggplot(aes(x=cluster_lasso, y=n_transcripts)) + 
  geom_boxplot(width=0.1, fill = "white")+
  geom_violin(fill="darkblue", alpha=0.2)+
  ggtitle(("Number of transcripts for LASSO groups")) + 
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=nb_motifs)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs in promoter for LASSO groups")) + 
  genes_info%>%
  ggplot(aes(x=cluster_lasso, y=nb_motifs_n_tfs)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs of N-responsive TFs in promoter for LASSO groups")) + 
  stat_compare_means() + genes_info%>%
  ggplot(aes(x=cluster_lasso, y=min_mse)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Min mse for LASSO groups")) + 
  stat_compare_means() 

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=var)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for LASSO groups")) + scale_y_log10()+
  stat_compare_means()+ genes_info%>%
  ggplot(aes(x=cluster_lasso, y=expr)) + 
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for LASSO groups")) + scale_y_log10()+
  stat_compare_means()

genes_info %>%
  group_by(cluster_lasso) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_lasso, y=tf_frac, 
                          label = paste("n=",n))) + 
  geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.2) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in RF groups")+ylim(c(0,0.2))

# promoteurs enrichis en certains motifs de TFs?
known_tfs <- tfs[which(tfs %in% pwm_prom$TF)]

get_number_of_motifs_per_tfs <- function(genes){
  table(pwm_prom[pwm_prom$target %in% genes & pwm_prom$TF %in% tfs,"TF"])[known_tfs]
}

targets_per_pwm <- data.frame(row.names = known_tfs)
for(group in unique(clusters_rf)){
  targets_per_pwm[paste("lasso", group)] <- get_number_of_motifs_per_tfs(names(
    which(clusters_lasso == group)))/sum(clusters_lasso == group)
  targets_per_pwm[paste("rf", group)] <- get_number_of_motifs_per_tfs(names(
    which(clusters_rf == group)))/sum(clusters_rf == group)

}

enrichments_per_pwm <- data.frame(row.names = known_tfs)
n_genes <- length(genes)
for(group in unique(clusters_rf)){
  # number of motifs in all the genes
  n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
  
  n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(names(
    which(clusters_lasso == group)))
  n_group_lasso <- length(names(which(clusters_lasso == group)))
  
  n_targets_rf_in_group <- get_number_of_motifs_per_tfs(names(
    which(clusters_rf == group)))
  n_group_rf <- length(names(which(clusters_rf == group)))
  
  for(tf in known_tfs){
    
    # number of genes with that motif in all genes
    n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
    
    # number of genes with that motif in the lasso group
    n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
    p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
           m=n_targets_in_all_tf, #white balls
           n=n_genes-n_targets_in_all_tf, # black balls
           k=n_group_lasso, lower.tail = FALSE)
    
     # number of genes with that motif in the rf group
    n_targets_rf_in_group_tf <- n_targets_rf_in_group[tf]
    p_rf <- phyper(q=n_targets_rf_in_group_tf-1,
           m=n_targets_in_all_tf, 
           n=n_genes-n_targets_in_all_tf,
           k=n_group_rf, lower.tail = FALSE)
    
    enrichments_per_pwm[tf, paste0(group, "lasso")]<- p_lasso
    enrichments_per_pwm[tf, paste0(group, "rf")]<- p_rf
  }
}

enrichments_per_pwm[enrichments_per_pwm<0.05] <- 0
enrichments_per_pwm[enrichments_per_pwm>=0.05] <- 1
Heatmap(enrichments_per_pwm, cluster_columns = F)

tfs_rf_pwm_pos <- rownames(enrichments_per_pwm[enrichments_per_pwm$`2rf`==0,])
tfs_lasso_pwm_pos <- rownames(enrichments_per_pwm[enrichments_per_pwm$`1lasso`==0,])
DIANE::get_gene_information(tfs_rf_pwm_pos, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(tfs_lasso_pwm_pos, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(intersect(tfs_rf_pwm_pos, tfs_lasso_pwm_pos ), organism = "Arabidopsis thaliana")
tfs_rf_pwm_bad <- rownames(enrichments_per_pwm[enrichments_per_pwm$`1rf`==0,])
tfs_lasso_pwm_bad <- rownames(enrichments_per_pwm[enrichments_per_pwm$`2lasso`==0,])
DIANE::get_gene_information(tfs_rf_pwm_bad, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(tfs_lasso_pwm_bad, organism = "Arabidopsis thaliana")
tfs_rf_pwm_pretty_bad <- rownames(enrichments_per_pwm[enrichments_per_pwm$`no diffrf`==0,])
tfs_lasso_pwm_bad <- rownames(enrichments_per_pwm[enrichments_per_pwm$`no difflasso`==0,])
DIANE::get_gene_information(tfs_rf_pwm_pretty_bad, organism = "Arabidopsis thaliana")
DIANE::get_gene_information(tfs_lasso_pwm_bad, organism = "Arabidopsis thaliana")

Network ranks of the Tfs that have enriched motifs in some gene clusters

load("results/100_permutations_bRF_edges.rdata")
# 10 first replicates
edges <- edges[names(edges)[as.numeric(str_split_fixed(names(edges), '_', 5)[,4])<=10]]
net <- edges$bRF_1_trueData_1_0.005
# nodes <- 
get_nodes_relative_rank <- function(net, nodes){
  return(setNames(rank(table(net$from))[nodes]/length(unique(net$from)),nodes))
}

positive_genes <- names(clusters_rf[clusters_rf==2])
edges_positives <- lapply(edges, function(net){net[net$to %in% positive_genes,]})

negative_genes <- names(clusters_rf[clusters_rf==1])
edges_negatives <- lapply(edges, function(net){net[net$to %in% negative_genes,]})


relative_rank <- lapply(edges, get_nodes_relative_rank, tfs_rf_pwm_pos)
relative_rank_pos <- lapply(edges_positives, get_nodes_relative_rank, tfs_rf_pwm_pos)

data.frame(relative_rank) %>%
  rownames_to_column("gene") %>%
  reshape2::melt() %>%
  separate(variable, into = c("method", "alpha", "dataset", "rep", "density"), sep = '_', remove = F) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  ggplot(aes(x=alpha, y=value, color = dataset)) + 
  ggh4x::facet_nested_wrap(vars(density, gene), nest_line = T) + 
  geom_point() + geom_smooth() +
  theme(strip.background = element_blank(), axis.title.x = element_text(size = 22),
        title = element_text(size = 16), strip.text = element_text(size = 16), 
        legend.text = element_text(size = 15), axis.text = element_text(size = 12), 
        legend.position = 'left') + ggtitle("TFs with motifs enriched in positive genes promoters : relative out-degree rank")


data.frame(relative_rank_pos) %>%
  rownames_to_column("gene") %>%
  reshape2::melt() %>%
  separate(variable, into = c("method", "alpha", "dataset", "rep", "density"), sep = '_', remove = F) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  ggplot(aes(x=alpha, y=value, color = dataset)) + 
  ggh4x::facet_nested_wrap(vars(density, gene), nest_line = T) + 
  geom_point() + geom_smooth() +
  theme(strip.background = element_blank(), axis.title.x = element_text(size = 22),
        title = element_text(size = 16), strip.text = element_text(size = 16), 
        legend.text = element_text(size = 15), axis.text = element_text(size = 12), 
        legend.position = 'left') + ggtitle("TFs with motifs enriched in positive genes promoters : relative out-degree rank in positive subset")



relative_rank_bad <- lapply(edges, get_nodes_relative_rank, tfs_rf_pwm_bad)
relative_rank_bad_subset <- lapply(edges_negatives, get_nodes_relative_rank, tfs_rf_pwm_bad)

data.frame(relative_rank_bad) %>%
  rownames_to_column("gene") %>%
  reshape2::melt() %>%
  separate(variable, into = c("method", "alpha", "dataset", "rep", "density"), sep = '_', remove = F) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  ggplot(aes(x=alpha, y=value, color = dataset)) + 
  ggh4x::facet_nested_wrap(vars(density, gene), nest_line = T) + 
  geom_point() + geom_smooth() +
  theme(strip.background = element_blank(), axis.title.x = element_text(size = 22),
        title = element_text(size = 16), strip.text = element_text(size = 16), 
        legend.text = element_text(size = 15), axis.text = element_text(size = 12), 
        legend.position = 'left') + ggtitle("TFs with motifs enriched in negative genes promoters : relative out-degree rank")

data.frame(relative_rank_bad_subset) %>%
  rownames_to_column("gene") %>%
  reshape2::melt() %>%
  separate(variable, into = c("method", "alpha", "dataset", "rep", "density"), sep = '_', remove = F) %>%
  mutate(alpha = as.numeric(alpha)) %>%
  ggplot(aes(x=alpha, y=value, color = dataset)) + 
  ggh4x::facet_nested_wrap(vars(density, gene), nest_line = T) + 
  geom_point() + geom_smooth() +
  theme(strip.background = element_blank(), axis.title.x = element_text(size = 22),
        title = element_text(size = 16), strip.text = element_text(size = 16), 
        legend.text = element_text(size = 15), axis.text = element_text(size = 12), 
        legend.position = 'left') + ggtitle("TFs with motifs enriched in negative genes promoters : relative out-degree rank in negative subset")

Go enrichments

library(DIANE)
background <- convert_from_agi(genes)

for(group in unique(clusters_rf)){
  
  genes_i <- names(which(clusters_lasso == group))

  print(paste("LASSO", length(genes_i), "genes,", group, "\n"))
  genes_i <- convert_from_agi(genes_i)
  go_lasso <- enrich_go(genes_i, background)
  DIANE::draw_enrich_go(go_lasso)
  print(go_lasso)
  genes_i <- names(which(clusters_rf == group))
  print(paste("RF", length(genes_i), "genes, group", group))
  genes_i <- convert_from_agi(genes_i)
  go_rf <- enrich_go(genes_i, background)
  DIANE::draw_enrich_go(go_rf)
  print(go_rf)
}
## [1] "LASSO 325 genes, 1 \n"
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 lignes> (ou 'row.names' de longueur nulle)
## [1] "RF 346 genes, group 1"
##                    ID             Description GeneRatio BgRatio       pvalue
## GO:0006396 GO:0006396          RNA processing    37/334 86/1372 6.436356e-05
## GO:0034660 GO:0034660 ncRNA metabolic process    35/334 82/1372 1.259164e-04
##              p.adjust     qvalue
## GO:0006396 0.03355672 0.03273827
## GO:0034660 0.03355672 0.03273827
##                                                                                                                                                                           geneID
## GO:0006396 NA/ATRRP4/NA/NA/REB1/APUM23/RH29/RPL7A/NA/EMB2762/HD2D/AtPEIP1/NA/NA/EDA13/NA/NA/FNBP4/NA/NA/DMS7/AtTRM11/AtWTF1/MEE49/NA/NA/NA/NA/NA/NA/NA/TOZ/NA/NA/NA/RID2/AtRPP30
## GO:0034660             NA/ATRRP4/NA/NA/REB1/APUM23/RH29/RPL7A/CCG/NA/EMB2762/HD2D/AtPEIP1/NA/NA/EDA13/NA/NA/NA/NA/NA/DMS7/AtTRM11/MEE49/NA/NA/NA/NA/NA/NA/NA/TOZ/NA/RID2/AtRPP30
##            Count
## GO:0006396    37
## GO:0034660    35
## [1] "LASSO 467 genes, 2 \n"
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 lignes> (ou 'row.names' de longueur nulle)
## [1] "RF 325 genes, group 2"
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 lignes> (ou 'row.names' de longueur nulle)
## [1] "LASSO 634 genes, no diff \n"
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 lignes> (ou 'row.names' de longueur nulle)
## [1] "RF 755 genes, group no diff"
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 lignes> (ou 'row.names' de longueur nulle)

Expression

expression <- data.frame(counts)
expression <- (expression-rowMeans(expression)) / matrixStats::rowSds(as.matrix(expression))

Heatmap(expression, cluster_columns = F, show_row_names = F)+
  rowAnnotation(
    clusters_rf = clusters_rf[rownames(expression)],
    clusters_lasso = clusters_lasso[rownames(true_mse)],
    col=list(clusters_lasso= setNames(c( "darkgreen","darkorange", "lightgrey"), 
                                       nm = names(table(clusters_lasso))),
      clusters_rf= setNames(c("darkorange", "darkgreen", "lightgrey"), 
                                       nm = names(table(clusters_rf)))))

intersection of groups

load("results/clusters_mse_bRF_100permutations.rdata")
load("results/clusters_mse_lasso_100permutations.rdata")
DIANE::draw_venn(list("positive for lasso" = names(clusters_lasso[clusters_lasso==1]),
                      "positive for RFs" = names(clusters_rf[clusters_rf==2])))

DIANE::draw_venn(list("negative for lasso" = names(clusters_lasso[clusters_lasso==2]),
                      "negative for RFs" = names(clusters_rf[clusters_rf==1])))

DIANE::draw_venn(list("no diff for lasso" = names(clusters_lasso[clusters_lasso=="no diff"]),
                      "no diff for RFs" = names(clusters_rf[clusters_rf=="no diff"])))